library(dplyr)
library(tidyr)
library(oceancolouR)
library(ggplot2)
library(mapdata)
library(patchwork)
library(DT)
library(grid)
library(scales) # for breaks_pretty in boxplot axis labels
# restrict matchups to these years (-Inf, Inf for no restrictions)
years <- c(-Inf,Inf)
# region bounds
lat_bounds <- c(39,63)
lon_bounds <- c(-71,-42)
max_dist <- 10000 # metres
max_depth <- 10 # in situ depth, metres
# maximum number of matchup pixels for each in situ record (within max_dist from in situ data)
max_matchups <- 100 # from the input file
max_matchups_new <- 25 # further restrictions to the output
# use the median of the "max_matchup_new" closest points? (or use the single closest point?)
use_median <- TRUE
olci_flags <- c("INVALID", "LAND", "TURBID", "ICE", "CLOUD1", "CLOUD2")
input_path <- "02_matchups/"
#*******************************************************************************
# READ AND FORMAT DATA
# use this to collect data from all sensors/variables to plot to the map later, color-coded by sensor
full_df <- data.frame(matrix(nrow=0, ncol=5), stringsAsFactors = FALSE)
colnames(full_df) <- c("Sensor", "Year", "DOY", "Latitude", "Longitude")
full_df$Sensor <- as.character(full_df$Sensor)
full_df[,2:3] <- lapply(full_df[,2:3], as.integer)
full_df[,4:5] <- lapply(full_df[,4:5], as.numeric)
data_list <- list()
plot_lm_list <- list()
plot_box_list <- list()
list_ind <- 1
num_cols <- length(unlist(all_variables[names(all_variables) %in% sensors]))
stat_df <- data.frame(matrix(nrow=length(sensors), ncol=10), stringsAsFactors = FALSE)
colnames(stat_df) <- c("Sensor", "Variable", "Flagged", "Years", "Intercept", "Slope", "R^2", "RMSE", "pvalue", "num_obs")
# loop through each sensor, and each of the variables for the sensor, and calculate statistics, fits, and create maps and plots
for (i in 1:length(sensors)) {
sensor <- sensors[i]
variables <- all_variables[[sensor]]
for (j in 1:length(variables)) {
variable <- variables[j]
input_fname <- gsub(" flagged", "", paste0("matchups_L3b_daily_", sensor, "_", variable, "_maxdist_", max_dist, "_maxdepth_", max_depth, "_maxmatchups_", max_matchups, ".csv"))
if (!file.exists(paste0(input_path, input_fname))) {next}
flagged <- grepl("flagged", sensor)
# this must be different because CHL-OC5 is written in the table as CHL.OC5
df_sat_var <- ifelse(variable=="CHL-OC5", "CHL.OC5", variable)
df <- read.csv(paste0(input_path, input_fname)) %>%
dplyr::mutate_if(is.numeric, round, digits = 3)
# assign the variable names to a generic name for easier use below
df$sat_var <- df[, paste0("sat_", df_sat_var)]
df$in_situ_var <- df[, "HPLC_CHLA"]
df <- df %>%
# remove bad data
dplyr::filter(is.finite(sat_var) & sat_var > 0 & is.finite(in_situ_var) & in_situ_var > 0) %>%
# restrict to a certain number of matchups for each day/pixel
dplyr::group_by(YEAR, DOY, LATDEC, LONGDEC, DEPTH) %>%
dplyr::slice_min(order_by = dist_to_matchup, n = max_matchups_new) %>%
dplyr::ungroup() %>%
# retrieve a single matchup for comparison to in situ data (either single closest, or median of closest pixels)
dplyr::arrange(., cruise_id, ID, DEPTH, LATDEC, LONGDEC, YEAR, DOY, HPLC_CHLA, HPLC_FUCOX, dist_to_matchup) %>%
dplyr::group_by(., cruise_id, ID, DEPTH, LATDEC, LONGDEC, YEAR, DOY, HPLC_CHLA, HPLC_FUCOX)
if (use_median) {
# use the median of all closest matchups
df <- df %>%
dplyr::summarize_all(., .funs = median, na.rm=TRUE) %>%
dplyr::select(-pancan_lon, -pancan_lat)
} else {
# OR use the closest matchup
df <- df %>%
dplyr::summarize_all(., .funs = dplyr::first)
}
df <- df %>%
dplyr::ungroup() %>%
dplyr::arrange(., YEAR, DOY)
# restrict to certain years
if (all(is.finite(years))) {
df <- df %>% dplyr::filter(between(YEAR, years[1], years[2]))
tmp_years <- c(max(min(df$YEAR, na.rm=TRUE),years[1]), min(max(df$YEAR, na.rm=TRUE), years[2]))
} else if (is.finite(years[1])) {
df <- df %>% dplyr::filter(YEAR >= years[1])
tmp_years <- c(max(min(df$YEAR, na.rm=TRUE),years[1]), max(df$YEAR, na.rm=TRUE))
} else if (is.finite(years[2])) {
df <- df %>% dplyr::filter(YEAR <= years[2])
tmp_years <- c(min(df$YEAR, na.rm=TRUE), min(max(df$YEAR, na.rm=TRUE), years[2]))
} else {
tmp_years <- range(df$YEAR, na.rm=TRUE)
}
# restrict lat/lon
df <- df %>%
dplyr::filter(between(LATDEC,lat_bounds[1],lat_bounds[2]), between(LONGDEC,lon_bounds[1],lon_bounds[2]))
# collect data for mapping later
full_df <- dplyr::bind_rows(full_df,
df %>%
dplyr::mutate(Sensor=sensor) %>%
dplyr::rename(Year=YEAR, Latitude=LATDEC, Longitude=LONGDEC) %>%
dplyr::select(Sensor, Year, DOY, Latitude, Longitude))
# FOR OLCI, RESTRICT BY FLAG VALUES
# All rows should be kept for printing and plotting, but flagged rows should not be used in the stats, linear regression, etc., and flagged values should be colored differently
if (startsWith(sensor, "OLCI") & flagged & length(olci_flags) > 0) {
df <- df %>%
dplyr::mutate(mask = rowSums(.[names(.) %in% paste0("MASK_", olci_flags)], na.rm=TRUE)) %>%
dplyr::mutate(mask = ifelse(mask > 0, 1, 0))
tmp_df_stats <- df %>% dplyr::filter(mask != 1)
} else {
df$mask <- rep(0, nrow(df))
tmp_df_stats <- df
}
# adjust the mask column - for non-flagged values, include info on season
df[df$mask==0 & df$DOY <= 181, "mask"] <- 2
df[df$mask==0 & df$DOY > 181, "mask"] <- 3
df$mask <- factor(df$mask)
#***********************************************************************
# CALCULATE STATS
errors <- vector_errors(as.numeric(unlist(df[,"HPLC_CHLA"])),
as.numeric(unlist(df[,paste0("sat_", df_sat_var)])))
df <- dplyr::bind_cols(df, round(errors,3))
mdf_lm <- lm(log10(sat_var) ~ log10(in_situ_var), data=tmp_df_stats)
stat_df[list_ind,] <- c(sensor, variable, flagged,
paste0(tmp_years, collapse="-"),
round(as.numeric(coef(mdf_lm)[1]),3),
round(as.numeric(coef(mdf_lm)[2]),3),
round(summary(mdf_lm)$r.squared,3),
round(rmse(log10(tmp_df_stats$sat_var), log10(tmp_df_stats$in_situ_var)),3),
formatC(lmp(mdf_lm), format="e", digits = 3),
nobs(mdf_lm))
#***********************************************************************
# CREATE LINEAR MODEL PLOT
p_lm <- ggplot(df) +
geom_point(aes(x=in_situ_var, y=sat_var, color=mask), size=1.5, alpha=0.7) +
# invisible dummy layer that will force the full range of colors for the legend, even if the points aren't on this graph
geom_point(data=data.frame(sat_var=1:3, in_situ_var=1:3, mask=as.factor(1:3), stringsAsFactors = FALSE), aes(x=in_situ_var, y=sat_var, color=mask), alpha = 0) +
# 1:1 line
geom_abline(slope=1, intercept=0) +
# linear model
geom_abline(slope=coef(mdf_lm)[2], intercept=coef(mdf_lm)[1], colour="red") +
# log scales
scale_x_continuous(limits=c(0.1,10), trans="log10") +
scale_y_continuous(limits=c(0.1,10), trans="log10") +
labs(x=paste0("In situ HPLC_CHLA"),
y=variable) +
theme_bw() +
scale_colour_manual(breaks = 1:3, values = c("black", "darkgreen", "red"), labels = c("Flagged", "Spring", "Fall")) +
theme(legend.position="right",
legend.title=element_blank(),
legend.text=element_text(size = 16)) +
annotation_custom(grobTree(textGrob(sensor, x=0.05, y=0.95, hjust=0, gp=gpar(fontsize=12)))) +
# adjust size of points in legend
guides(colour=guide_legend(override.aes=list(size=5)))
plot_lm_list[[list_ind]] <- p_lm
#***********************************************************************
# CREATE ERROR BOXPLOTS
p_box <- ggplot(df[df$mask != 1,], aes(x=factor(YEAR), y=percent_error, fill=factor(mask))) +
geom_boxplot(outlier.shape=21, outlier.size=2) +
# invisible dummy layer that will force the full range of colors for the legend, even if the points aren't on this graph
geom_point(data=data.frame(YEAR=tmp_years[2], percent_error=rep(0,2), mask=as.factor(2:3), stringsAsFactors = FALSE), aes(x=factor(YEAR), y=percent_error, fill=factor(mask)), alpha = 0) +
scale_y_continuous(limits=c(-200,2000), breaks=c(-100,-10,-1,1,10,100,1000), labels=c(-100,-10,-1,1,10,100,1000), trans="asinh") +
scale_x_discrete(breaks=breaks_pretty(n=round(diff(range(df$YEAR, na.rm=TRUE))/2))) +
scale_fill_manual(values=c("2"="#33EE55", "3"="#FF3333"), labels=c("Spring", "Fall")) +
geom_hline(yintercept=mean(unlist(df[df$mask==2, "percent_error"]), na.rm=TRUE),color='#11CC33',alpha=0.7) +
geom_hline(yintercept=mean(unlist(df[df$mask==3, "percent_error"]), na.rm=TRUE),color='#FF3333',alpha=0.7) +
theme_bw() +
labs(fill="Season", y=expression(paste('[',italic('Chla'),'] % error '))) +
theme(legend.position="right",
legend.title=element_text(size = 20),
legend.text=element_text(size = 16),
legend.key.size = unit(3,"line"),
axis.title.x=element_blank(),
axis.text.x=element_text(size=10,angle=45,colour='black',vjust=1, hjust=1),
axis.title.y=element_text(size=12,colour="black")) +
annotation_custom(grobTree(textGrob(sensor, x=0.05, y=0.95, hjust=0, gp=gpar(fontsize=12))))
plot_box_list[[list_ind]] <- p_box
list_ind <- list_ind + 1
}
}
Below are the statistics from each set of in situ / satellite matchups, restricted to satellite matchups within 10000 metres of the in situ data sampling location, and a depth of 10 metres.
Satellite data and distance to matchup are the median values of the 25 closest binned pixels to the in situ sampling location, within 10000 metres.
Flags used for OLCI: INVALID, LAND, TURBID, ICE, CLOUD1, CLOUD2
“Fall” points are defined as anything after day of year 181, and “Spring” points before that. Flagged points are in black, and have not been used in the model.
Percent error for each sensor and variable are plotted below. Green and red boxes represent spring and fall error respectively, and green and red lines represent the mean value of each.
Matchups are mapped below, colour-coded by sensor. Note that many matchups for different sensors might overlap.